# LIBRARIES
library(tidyverse) #Set of packages that work in harmony for data representations and API design (ggplot2, dplyr, readr, etc)
library(sf) #Package for Simple Features, way to encode spatial vector data. Binds to GDAL for reading/writing of data, GEOS for geometrical operations and PROJ for projection conversions/datum transformations (http://strimas.com/r/tidy-sf/)
library(tmap)
library(USAboundaries) #Boundaries for geographical units in the United States of America (U.S. Census Bureau)
library(tidyr) #Use to reshape dataframe
library(lubridate) #Handle dates/time series
library(forcats) #Handle factors
### MONITORING SITES LOCATIONS
#Absolute path of the MS locations in the FTP server
locfile <- 'Data/AirNow/monitoring_site_locations.dat'
#Metadata on MonitoringSiteFactSheet.pdf
fieldnames <- c("AQSID","parameter.name","site.code","site.name","status","agency.id","agency.name","EPA.region","latitude","longitude","elevation","GMT.offset","country.code","CMSA.code","CMSA.name","MSA.code","MSA.name","state.code","state.name","county.code","county.name","city.code","city.name")
#Read the .dat file in a tibble dataframe. Tibbles are modern data frames that keep the useful features of the traditional data frames and drop ones that were frustating (e.g. character vectors to factor, show only columns that fit to the screen when printing, etc.)
MS <- read_delim(locfile, delim='|', col_names = fieldnames, progress = TRUE)
print(MS)
#List of parameters measured
MS.chems <- unique(MS$parameter.name)
print(paste('In total, there are',length(MS.chems),'parameters measured',sep=' '))
[1] "In total, there are 27 parameters measured"
print(MS.chems)
 [1] "O3"       "PM2.5"    "RWD"      "TEMP"     "RHUM"     "SO2"      "RWS"     
 [8] "BARPR"    "PM10"     "WS"       "WD"       "PRECIP"   "NOY"      "NO"      
[15] "SRAD"     "NOX"      "NO2"      "CO"       "NO2Y"     "BC"       "UV-AETH" 
[22] "PMC"      "H2S"      "DEWPOINT" "SO2-15"   "PM2.5-15" "SO4"     
#List of stations
MS.stat <- unique(MS$AQSID)
print(paste('In total, there are',length(MS.stat),'different stations',sep=' '))
[1] "In total, there are 2738 different stations"
#MS locations
#Extract stations measuring ozone (MS.chems=O3) and particulate matter (MS.chems=PM2.5)
MS <- filter(MS,parameter.name %in% c('PM2.5','O3'))
#Convert the data frame to a spatial data frame and project the coordinates in USA Contiguous Albers Equal Area Conic (EPSG:102003)
MS <- st_as_sf(MS, coords = c('longitude','latitude'), agr = 'identity', crs=4326) %>% st_transform(MS, crs=102003) 
#Visualisation of PM2.5 and O3 MS (different colors for active/inactive MS)
tmap_mode("view")
tm_shape(us_counties()) + tm_borders(col='black', lty=2) +
  tm_shape(us_states()) + tm_borders(col='black', lwd=2) +
  tm_shape(MS[MS$parameter.name=='PM2.5',],name='PM2.5 MS locations') +          tm_dots(col='status',palette=c('blue','red')) + 
  tm_shape(MS[MS$parameter.name=='O3',],name='O3 MS locations') + tm_dots(col='status',palette=c('blue','red'),legend.show=FALSE)
#CREATE TWO DATAFRAMES : counties to store all the counties we want to keep, MS to store all the monitoring stations belonging to the counties
#include neighboring counties of Chicago and also Lake and Porter from Indiana (lot of industries)
counties <- us_counties() %>% filter(state_name %in% c('Illinois','Indiana','Wisconsin') & name %in% c('Cook','Lake','DuPage','Will','Porter','Kenosha')) %>% st_transform(counties, crs=102003) 
#Sites measuring O3 and PM2.5 at the 6 counties we're interested in
MS <- MS[which(st_intersects(MS,counties) > 0),]
#Same MS for PM2.5 and O3
MS.both<-group_by(MS,by=AQSID) %>% summarise(length=n()) %>% filter(length==2) %>% .[['by']]
#Plot the counties included in the MS location
ggplot(counties,aes(fill = affgeoid)) + geom_sf()

#Add metadata about the Monitoring Stations (Type, Objectives, ...) from the different states documents (Illinois, Indiana, Wisconsin). Informations for Wisconsin and Indiana will be added manually (only a few stations)
path_docs <- '/Volumes/GoogleDrive/My Drive/PDM/Docs/'
MS <- read_csv(paste0(path_docs,'meta_O3_MS.csv'),col_names = TRUE,na = c("N/A", "NA")) %>% rbind(read_csv(paste0(path_docs,'meta_PM25_MS.csv'),col_names = TRUE,na = c("N/A", "NA"))) %>% mutate(AQSID = gsub(pattern='-',replacement = '',x=AQSID)) %>% merge(x=MS,by=c('AQSID','parameter.name'),all.x=TRUE)
MS <- MS %>% select(AQSID,site.name.x,parameter.name,status,state.name,county.name,primary.objective,secondary.objective,spatial.scale,station.type) %>% rename(site.name=site.name.x)
#READ DATA FOR ALL YEARS
yeardir <- 'Data/AirNow/DailyPeak'
yearfiles <- list.files(yeardir,full.names=TRUE)
fieldnames <- c("vdate","AQSID","site.name","parameter.name","report.units",
                "value","avg.period","agency.name")
#Function to extract the desired data (Ozone-8h and PM2.5-24h) for a given file
extract_data <- function(file) {
  return(read_delim(file, "|", col_names = fieldnames, progress = FALSE) %>% filter(AQSID %in% unique(MS$AQSID)) %>% filter(parameter.name=='OZONE-8HR' | parameter.name=='PM2.5-24hr'))
}
#For loop to go over all the files in the directory
data <- data.frame()
for(j in 1:length(yearfiles)) {
  datadir <- paste0(yeardir,substr(yearfiles[j],nchar(yearfiles[j])-4,nchar(yearfiles[j])))
  datfiles <- list.files(datadir,full.names=TRUE)
  
  for(i in 1:length(datfiles))
  {
  res <- extract_data(datfiles[i])
  data <- rbind(data,res)
  }
}
str(data)
Classes ‘tbl_df’, ‘tbl’ and 'data.frame':   34181 obs. of  8 variables:
 $ vdate         : chr  "10/22/13" "10/22/13" "10/22/13" "10/22/13" ...
 $ AQSID         : chr  "170310001" "170310001" "170310032" "170310064" ...
 $ site.name     : chr  "ALSIP" "ALSIP" "CHI_SWFP" "CHI_U" ...
 $ parameter.name: chr  "PM2.5-24hr" "OZONE-8HR" "OZONE-8HR" "OZONE-8HR" ...
 $ report.units  : chr  "UG/M3" "PPB" "PPB" "PPB" ...
 $ value         : num  10.2 7 10 7 14 10 16 3.3 11 21 ...
 $ avg.period    : int  24 8 8 8 8 8 8 24 8 8 ...
 $ agency.name   : chr  "Illinois EPA" "Illinois EPA" "Illinois EPA" "Illinois EPA" ...
 - attr(*, "spec")=List of 2
  ..$ cols   :List of 8
  .. ..$ vdate         : list()
  .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
  .. ..$ AQSID         : list()
  .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
  .. ..$ site.name     : list()
  .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
  .. ..$ parameter.name: list()
  .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
  .. ..$ report.units  : list()
  .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
  .. ..$ value         : list()
  .. .. ..- attr(*, "class")= chr  "collector_double" "collector"
  .. ..$ avg.period    : list()
  .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
  .. ..$ agency.name   : list()
  .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
  ..$ default: list()
  .. ..- attr(*, "class")= chr  "collector_guess" "collector"
  ..- attr(*, "class")= chr "col_spec"
#DATA WRANGLING
data.bckp <- data
#data <- data.bckp
data_wrangling <- function(df) {
  #Keep only useful fields (time, AQSID, parameter, values)
  df <- select(df,c('vdate','AQSID','parameter.name','value')) 
  
  #Convert to tidy format (spread parameter name)
  df <- spread(df, key='parameter.name', value='value')
  
  #Change name of columns
  names(df) <- c('date','AQSID','O3','PM25')
  
  #Data conversion
  df$date<-mdy(df$date) #date from char to date
  df$AQSID<-factor(df$AQSID) #AQSID from char to factor
  return(df)
}
#Data wrangling process
data <- data_wrangling(data)
#Make appear missing dates explicitely using complete() from tidyr
data <- complete(data,date = seq.Date(min(date), max(date), by="day"))
#Print implicit missing values
missingD <- filter(data, is.na(O3) & is.na(PM25)) %>% .[['date']] %>% format("%Y%m%d")
print(missingD) #Dates with missing data
 [1] "20151230" "20170818" "20170819" "20170820" "20170821" "20170822" "20170823"
 [8] "20170824" "20170825" "20170826" "20170827" "20170828" "20170829" "20170830"
[15] "20170831" "20170901" "20170902" "20170903" "20170904" "20170905"
 
#Extract missing data in http://files.airnowtech.org/ if possible
extract_missingD <- function(missingD) {
  
  df<-data.frame()
  
  for (i in 1:length(missingD)) {
    
  df_sub <- read.delim(paste0("https://s3-us-west-1.amazonaws.com//files.airnowtech.org/airnow/",substr(missingD[i],1,4),"/",missingD[i],"/daily_data.dat"), sep="|", header=FALSE, col.names = fieldnames) %>% filter(AQSID %in% unique(MS$AQSID)) %>% filter(parameter.name=='OZONE-8HR' | parameter.name=='PM2.5-24hr') #Extract data and filter to only have the wanted MS and parameters (PM2.5/O3)
   
  df <- rbind(df,df_sub)
  }
  
  return(df)
}
#Extract missing data
missingD.data <- extract_missingD(missingD)
#Rearrange the results according to the wrangling process
missingD.data <- data_wrangling(missingD.data)
#Check if the daily_data.dat is daily peak values by using the same function to extract known informations
missingD.test <- sample_n(data, 10) #Take 10 random rows of the initial dataframe
missingD.test.data <- extract_missingD(format(missingD.test$date,"%Y%m%d")) #Extract values on files.airnowtech.org
missingD.test.data<- data_wrangling(missingD.test.data) #Reshape the resulting dataframe
missingD.comp<-inner_join(missingD.test,missingD.test.data,by=c('date','AQSID'))
#print(missingD.comp$O3.x==missingD.comp$O3.y)
#print(missingD.comp$PM25.x==missingD.comp$PM25.y)
#daily_data.dat corresponds to daily peak values
#Bind retrieved missing values to dataframe
data <- data %>% drop_na(AQSID) #Remove the rows added to detect missing values
data <- rbind(data,missingD.data)
#Number of missing values still missing
print(nrow(complete(data,date = seq.Date(min(date), max(date), by="day")))-nrow(data))
[1] 0
#Number of observations we're supposed to have for a complete time series (nb of days between the first and last day as daily measurements)
nbDays<-as.numeric(max(data$date) - min(data$date))

3 cases : - No data - Only seasonal data - Complete time series (more or less)

#EXPLORATORY DATA ANALYSIS of Ozone
MS.O3 <- filter(MS,parameter.name=='O3') %>% select(-parameter.name) #monitoring stations measuring O3
data.O3 <- data %>% filter(AQSID %in% MS.O3[['AQSID']]) %>% subset(select = -PM25) #subset of the dataframe keep only the MS recording O3 (the ones in MS.O3)
data.O3 <- data.O3 %>% droplevels() %>% complete(date,AQSID) #complete the dataframe with the missing combinaisons (data,AQSID) of data in order to make explicite the missing values
MS.O3 <- data.O3 %>% group_by(AQSID) %>% summarize(nb_na = sum(is.na(O3)), mean_val = mean(O3, na.rm=TRUE)) %>% full_join(MS.O3, by='AQSID') #count the nb of missing observations (obs=dailyPeak record) per node
data.O3 %>% ggplot(aes(x=date, y=O3)) + geom_line() + facet_wrap(~AQSID, nrow=5) + ggtitle('2013-2018 time series for the O3 measurements at each node') + labs(x='Date',y='O3 [ppb]') %>% print() #time series for each O3 monitoring station
$x
[1] "Date"

$y
[1] "O3 [ppb]"

attr(,"class")
[1] "labels"

MS.O3 <- MS.O3 %>% mutate(group=ifelse(nb_na < 0.1*nbDays, '>90%', ifelse(nb_na > 0.8*nbDays, '<20%', '20%-90%'))) #create groups according the different type of data frequency as noticed in the time series diagnostic
pal.typeData <- c('<20%'='#f66253','>90%'='#86b24f','20%-90%'='#f6a760') #create a color palette corresponding to these groups
print(MS.O3)
  
# ggplot() + geom_sf(data=counties) + geom_sf(data=MS.O3,aes(color=group)) + scale_colour_manual(values = pal.typeData,na.value='black') + ggtitle('O3 monitoring stations') + labs(colour='Time series completeness') + theme_void() #plot the O3 monitoring stations according the data frequency they have
write.csv(data.O3,'Data/data_O3.csv')
#EXPLORATORY DATA ANALYSIS of PM2.5
MS.PM25 <- filter(MS,parameter.name=='PM2.5') %>% select(-parameter.name) #monitoring stations measuring PM2.5
data.PM25 <- data %>% filter(AQSID %in% MS.PM25[['AQSID']]) %>% subset(select = -O3) #subset of the dataframe keep only the MS recording PM2.5 (the ones in MS.PM25)
data.PM25 <- data.PM25 %>% droplevels() %>% complete(date,AQSID) #complete the dataframe with the missing combinaisons (data,AQSID) of data in order to make explicite the missing values
MS.PM25 <- data.PM25 %>% group_by(AQSID) %>% summarize(nb_na = sum(is.na(PM25))) %>% full_join(MS.PM25, by='AQSID') #count the nb of missing observations (obs=dailyPeak record) per node
print(MS.PM25)
data.PM25 %>% ggplot(aes(x=date, y=PM25)) + geom_line() + facet_wrap(~AQSID, nrow=4) + ggtitle('2013-2018 time series for the PM2.5 measurements at each node') + labs(x='Date',y='PM2.5 [ug/m3]') %>% print() #time series for each PM2.5 monitoring station
$x
[1] "Date"

$y
[1] "PM2.5 [ug/m3]"

attr(,"class")
[1] "labels"

MS.PM25 <- MS.PM25 %>% mutate(group=ifelse(nb_na < 0.1*nbDays, '>90%', ifelse(nb_na > 0.8*nbDays, '<20%', '20%-90%'))) #create groups according the different type of data frequency as noticed in the time series diagnostic
print(MS.PM25)
# ggplot() + geom_sf(data=counties) + geom_sf(data=MS.PM25,aes(color=group)) + scale_colour_manual(values = pal.typeData,na.value='black') + ggtitle('PM2.5 monitoring stations') + labs(colour='Time series completeness') + theme_void() #plot the O3 monitoring stations according the data frequency they have
write.csv(data.PM25,'Data/data_PM25.csv')
tmap_mode("view")
tmap mode set to interactive viewing
popup.vars <- c("AQSID"="AQSID","Site Name"="site.name","State"="state.name","County"="county.name","Primary Objective"="primary.objective","Secondary Objective"="secondary.objective","Spatial Scale"="spatial.scale","Station Type"="station.type")
tm_shape(counties) + tm_borders(col='black', lty=2) +
  tm_shape(st_sf(MS.O3), name='O3 monitoring stations') + tm_dots(col='group',palette=pal.typeData,popup.vars=popup.vars) +
  tm_shape(st_sf(MS.PM25), name='PM2.5 monitoring stations') + tm_dots(col='group',palette=pal.typeData,legend.show=FALSE,popup.vars=popup.vars) + tm_layout(title='Select MS for a specific pollutant in the layer selection')

annotate(“text”, x = -80, y = 35, label=MS.PM25$AQSID[1]) +

What I want

-Forecast of the ambient air pollution (O3, PM2.5) 24h ahead of time -Spatial predictions at unmonitored locations to have a continuous field of O3-PM2.5 predictions for the next day -Mapping of the new AQI -Potential improvement : The Array of Things project

Data at ~15 locations for PM and ~15 locations for O3 and data from 2013 to present.

Use of Ozone-8h measurements and PM2.5-24hr measurements (to allow direct comparisons with the standards values). PM2.5 : 35ug/m3 is the 24-hours standard and 15ug/m3 is the annual standard O3 : 0.075 ppm is the 8-hours standard

Pay attention : -temporal autocorrelation to find the time period to use? -counties to include in the study that can influence the air pollution in the Chicago area? -anisotropic field due to the lake effect -space/time separable or non-separable process? -Bayesian Spatial Predictor (separate spatio-temporal process) or temporal model (ex : SVM) and after co-kriging ?

In the case of BMS, need to split dataset to ungauged and gauged sites so lost of informations.

---
title: "AirNow"
author : "Anaïs Ladoy"
date : "27.04.2018"
output:
  html_document:
    fig_caption: yes
---

```{r setup, include=FALSE, message=FALSE}
knitr::opts_chunk$set(echo = TRUE)
```



```{r, results=FALSE}
# LIBRARIES

library(tidyverse) #Set of packages that work in harmony for data representations and API design (ggplot2, dplyr, readr, etc)

library(sf) #Package for Simple Features, way to encode spatial vector data. Binds to GDAL for reading/writing of data, GEOS for geometrical operations and PROJ for projection conversions/datum transformations (http://strimas.com/r/tidy-sf/)

library(tmap)

library(USAboundaries) #Boundaries for geographical units in the United States of America (U.S. Census Bureau)

library(tidyr) #Use to reshape dataframe

library(lubridate) #Handle dates/time series

library(forcats) #Handle factors

```



```{r, message=FALSE, warning=FALSE}
### MONITORING SITES LOCATIONS

#Absolute path of the MS locations in the FTP server
locfile <- 'Data/AirNow/monitoring_site_locations.dat'

#Metadata on MonitoringSiteFactSheet.pdf
fieldnames <- c("AQSID","parameter.name","site.code","site.name","status","agency.id","agency.name","EPA.region","latitude","longitude","elevation","GMT.offset","country.code","CMSA.code","CMSA.name","MSA.code","MSA.name","state.code","state.name","county.code","county.name","city.code","city.name")

#Read the .dat file in a tibble dataframe. Tibbles are modern data frames that keep the useful features of the traditional data frames and drop ones that were frustating (e.g. character vectors to factor, show only columns that fit to the screen when printing, etc.)
MS <- read_delim(locfile, delim='|', col_names = fieldnames, progress = TRUE)
print(MS)

#List of parameters measured
MS.chems <- unique(MS$parameter.name)
print(paste('In total, there are',length(MS.chems),'parameters measured',sep=' '))
print(MS.chems)

#List of stations
MS.stat <- unique(MS$AQSID)
print(paste('In total, there are',length(MS.stat),'different stations',sep=' '))

```

```{r, message=FALSE, warning=FALSE}
#MS locations

#Extract stations measuring ozone (MS.chems=O3) and particulate matter (MS.chems=PM2.5)
MS <- filter(MS,parameter.name %in% c('PM2.5','O3'))

#Convert the data frame to a spatial data frame and project the coordinates in USA Contiguous Albers Equal Area Conic (EPSG:102003)
MS <- st_as_sf(MS, coords = c('longitude','latitude'), agr = 'identity', crs=4326) %>% st_transform(MS, crs=102003) 

#Visualisation of PM2.5 and O3 MS (different colors for active/inactive MS)
tmap_mode("view")
tm_shape(us_counties()) + tm_borders(col='black', lty=2) +
  tm_shape(us_states()) + tm_borders(col='black', lwd=2) +
  tm_shape(MS[MS$parameter.name=='PM2.5',],name='PM2.5 MS locations') +          tm_dots(col='status',palette=c('blue','red')) + 
  tm_shape(MS[MS$parameter.name=='O3',],name='O3 MS locations') + tm_dots(col='status',palette=c('blue','red'),legend.show=FALSE)

```


```{r, message=FALSE, warning=FALSE}
#CREATE TWO DATAFRAMES : counties to store all the counties we want to keep, MS to store all the monitoring stations belonging to the counties

#include neighboring counties of Chicago and also Lake and Porter from Indiana (lot of industries)

counties <- us_counties() %>% filter(state_name %in% c('Illinois','Indiana','Wisconsin') & name %in% c('Cook','Lake','DuPage','Will','Porter','Kenosha')) %>% st_transform(counties, crs=102003) 

#Sites measuring O3 and PM2.5 at the 6 counties we're interested in
MS <- MS[which(st_intersects(MS,counties) > 0),]

#Same MS for PM2.5 and O3
MS.both<-group_by(MS,by=AQSID) %>% summarise(length=n()) %>% filter(length==2) %>% .[['by']]

#Plot the counties included in the MS location
ggplot(counties,aes(fill = affgeoid)) + geom_sf()

#Add metadata about the Monitoring Stations (Type, Objectives, ...) from the different states documents (Illinois, Indiana, Wisconsin). Informations for Wisconsin and Indiana will be added manually (only a few stations)
path_docs <- '/Volumes/GoogleDrive/My Drive/PDM/Docs/'

MS <- read_csv(paste0(path_docs,'meta_O3_MS.csv'),col_names = TRUE,na = c("N/A", "NA")) %>% rbind(read_csv(paste0(path_docs,'meta_PM25_MS.csv'),col_names = TRUE,na = c("N/A", "NA"))) %>% mutate(AQSID = gsub(pattern='-',replacement = '',x=AQSID)) %>% merge(x=MS,by=c('AQSID','parameter.name'),all.x=TRUE)

MS <- MS %>% select(AQSID,site.name.x,parameter.name,status,state.name,county.name,primary.objective,secondary.objective,spatial.scale,station.type) %>% rename(site.name=site.name.x)

```


```{r, warning=FALSE, message=FALSE}
#READ DATA FOR ALL YEARS

yeardir <- 'Data/AirNow/DailyPeak'
yearfiles <- list.files(yeardir,full.names=TRUE)
fieldnames <- c("vdate","AQSID","site.name","parameter.name","report.units",
                "value","avg.period","agency.name")

#Function to extract the desired data (Ozone-8h and PM2.5-24h) for a given file
extract_data <- function(file) {
  return(read_delim(file, "|", col_names = fieldnames, progress = FALSE) %>% filter(AQSID %in% unique(MS$AQSID)) %>% filter(parameter.name=='OZONE-8HR' | parameter.name=='PM2.5-24hr'))
}

#For loop to go over all the files in the directory
data <- data.frame()

for(j in 1:length(yearfiles)) {
  datadir <- paste0(yeardir,substr(yearfiles[j],nchar(yearfiles[j])-4,nchar(yearfiles[j])))
  datfiles <- list.files(datadir,full.names=TRUE)
  
  for(i in 1:length(datfiles))
  {
  res <- extract_data(datfiles[i])
  data <- rbind(data,res)
  }
}

str(data)
```

```{r, warning=FALSE, message=FALSE}
#DATA WRANGLING
data.bckp <- data
#data <- data.bckp

data_wrangling <- function(df) {

  #Keep only useful fields (time, AQSID, parameter, values)
  df <- select(df,c('vdate','AQSID','parameter.name','value')) 
  
  #Convert to tidy format (spread parameter name)
  df <- spread(df, key='parameter.name', value='value')
  
  #Change name of columns
  names(df) <- c('date','AQSID','O3','PM25')
  
  #Data conversion
  df$date<-mdy(df$date) #date from char to date
  df$AQSID<-factor(df$AQSID) #AQSID from char to factor

  return(df)
}

#Data wrangling process
data <- data_wrangling(data)

#Make appear missing dates explicitely using complete() from tidyr
data <- complete(data,date = seq.Date(min(date), max(date), by="day"))

#Print implicit missing values
missingD <- filter(data, is.na(O3) & is.na(PM25)) %>% .[['date']] %>% format("%Y%m%d")
print(missingD) #Dates with missing data
 
#Extract missing data in http://files.airnowtech.org/ if possible
extract_missingD <- function(missingD) {
  
  df<-data.frame()
  
  for (i in 1:length(missingD)) {
    
  df_sub <- read.delim(paste0("https://s3-us-west-1.amazonaws.com//files.airnowtech.org/airnow/",substr(missingD[i],1,4),"/",missingD[i],"/daily_data.dat"), sep="|", header=FALSE, col.names = fieldnames) %>% filter(AQSID %in% unique(MS$AQSID)) %>% filter(parameter.name=='OZONE-8HR' | parameter.name=='PM2.5-24hr') #Extract data and filter to only have the wanted MS and parameters (PM2.5/O3)
   
  df <- rbind(df,df_sub)
  }
  
  return(df)
}

#Extract missing data
missingD.data <- extract_missingD(missingD)

#Rearrange the results according to the wrangling process
missingD.data <- data_wrangling(missingD.data)

#Check if the daily_data.dat is daily peak values by using the same function to extract known informations
missingD.test <- sample_n(data, 10) #Take 10 random rows of the initial dataframe
missingD.test.data <- extract_missingD(format(missingD.test$date,"%Y%m%d")) #Extract values on files.airnowtech.org
missingD.test.data<- data_wrangling(missingD.test.data) #Reshape the resulting dataframe

missingD.comp<-inner_join(missingD.test,missingD.test.data,by=c('date','AQSID'))
#print(missingD.comp$O3.x==missingD.comp$O3.y)
#print(missingD.comp$PM25.x==missingD.comp$PM25.y)
#daily_data.dat corresponds to daily peak values

#Bind retrieved missing values to dataframe
data <- data %>% drop_na(AQSID) #Remove the rows added to detect missing values
data <- rbind(data,missingD.data)

#Number of missing values still missing
print(nrow(complete(data,date = seq.Date(min(date), max(date), by="day")))-nrow(data))

#Number of observations we're supposed to have for a complete time series (nb of days between the first and last day as daily measurements)
nbDays<-as.numeric(max(data$date) - min(data$date))

```

3 cases :
- No data
- Only seasonal data
- Complete time series (more or less)

```{r, warning=FALSE, message=FALSE}
#EXPLORATORY DATA ANALYSIS of Ozone

MS.O3 <- filter(MS,parameter.name=='O3') %>% select(-parameter.name) #monitoring stations measuring O3
data.O3 <- data %>% filter(AQSID %in% MS.O3[['AQSID']]) %>% subset(select = -PM25) #subset of the dataframe keep only the MS recording O3 (the ones in MS.O3)

data.O3 <- data.O3 %>% droplevels() %>% complete(date,AQSID) #complete the dataframe with the missing combinaisons (data,AQSID) of data in order to make explicite the missing values
MS.O3 <- data.O3 %>% group_by(AQSID) %>% summarize(nb_na = sum(is.na(O3)), mean_val = mean(O3, na.rm=TRUE)) %>% full_join(MS.O3, by='AQSID') #count the nb of missing observations (obs=dailyPeak record) per node

data.O3 %>% ggplot(aes(x=date, y=O3)) + geom_line() + facet_wrap(~AQSID, nrow=5) + ggtitle('2013-2018 time series for the O3 measurements at each node') + labs(x='Date',y='O3 [ppb]') %>% print() #time series for each O3 monitoring station

MS.O3 <- MS.O3 %>% mutate(group=ifelse(nb_na < 0.1*nbDays, '>90%', ifelse(nb_na > 0.8*nbDays, '<20%', '20%-90%'))) #create groups according the different type of data frequency as noticed in the time series diagnostic
pal.typeData <- c('<20%'='#f66253','>90%'='#86b24f','20%-90%'='#f6a760') #create a color palette corresponding to these groups

print(MS.O3)
  
# ggplot() + geom_sf(data=counties) + geom_sf(data=MS.O3,aes(color=group)) + scale_colour_manual(values = pal.typeData,na.value='black') + ggtitle('O3 monitoring stations') + labs(colour='Time series completeness') + theme_void() #plot the O3 monitoring stations according the data frequency they have

write.csv(data.O3,'Data/data_O3.csv')

```

```{r, warning=FALSE, message=FALSE}
#EXPLORATORY DATA ANALYSIS of PM2.5

MS.PM25 <- filter(MS,parameter.name=='PM2.5') %>% select(-parameter.name) #monitoring stations measuring PM2.5
data.PM25 <- data %>% filter(AQSID %in% MS.PM25[['AQSID']]) %>% subset(select = -O3) #subset of the dataframe keep only the MS recording PM2.5 (the ones in MS.PM25)

data.PM25 <- data.PM25 %>% droplevels() %>% complete(date,AQSID) #complete the dataframe with the missing combinaisons (data,AQSID) of data in order to make explicite the missing values
MS.PM25 <- data.PM25 %>% group_by(AQSID) %>% summarize(nb_na = sum(is.na(PM25))) %>% full_join(MS.PM25, by='AQSID') #count the nb of missing observations (obs=dailyPeak record) per node

print(MS.PM25)

data.PM25 %>% ggplot(aes(x=date, y=PM25)) + geom_line() + facet_wrap(~AQSID, nrow=4) + ggtitle('2013-2018 time series for the PM2.5 measurements at each node') + labs(x='Date',y='PM2.5 [ug/m3]') %>% print() #time series for each PM2.5 monitoring station

MS.PM25 <- MS.PM25 %>% mutate(group=ifelse(nb_na < 0.1*nbDays, '>90%', ifelse(nb_na > 0.8*nbDays, '<20%', '20%-90%'))) #create groups according the different type of data frequency as noticed in the time series diagnostic

print(MS.PM25)

# ggplot() + geom_sf(data=counties) + geom_sf(data=MS.PM25,aes(color=group)) + scale_colour_manual(values = pal.typeData,na.value='black') + ggtitle('PM2.5 monitoring stations') + labs(colour='Time series completeness') + theme_void() #plot the O3 monitoring stations according the data frequency they have

write.csv(data.PM25,'Data/data_PM25.csv')

```


```{r}
tmap_mode("view")
popup.vars <- c("AQSID"="AQSID","Site Name"="site.name","State"="state.name","County"="county.name","Primary Objective"="primary.objective","Secondary Objective"="secondary.objective","Spatial Scale"="spatial.scale","Station Type"="station.type")

tm_shape(counties) + tm_borders(col='black', lty=2) +
  tm_shape(st_sf(MS.O3), name='O3 monitoring stations') + tm_dots(col='group',palette=pal.typeData,popup.vars=popup.vars) +
  tm_shape(st_sf(MS.PM25), name='PM2.5 monitoring stations') + tm_dots(col='group',palette=pal.typeData,legend.show=FALSE,popup.vars=popup.vars) + tm_layout(title='Select MS for a specific pollutant in the layer selection')

```



annotate("text", x = -80, y = 35, label=MS.PM25$AQSID[1]) +

##What I want
-Forecast of the ambient air pollution (O3, PM2.5) 24h ahead of time
-Spatial predictions at unmonitored locations to have a continuous field of O3-PM2.5 predictions for the next day
-Mapping of the new AQI
-Potential improvement : The Array of Things project

Data at ~15 locations for PM and ~15 locations for O3 and data from 2013 to present.

Use of Ozone-8h measurements and PM2.5-24hr measurements (to allow direct comparisons with the standards values).
PM2.5 : 35ug/m3 is the 24-hours standard and 15ug/m3 is the annual standard
O3 : 0.075 ppm is the 8-hours standard

Pay attention :
-temporal autocorrelation to find the time period to use?
-counties to include in the study that can influence the air pollution in the Chicago area?
-anisotropic field due to the lake effect
-space/time separable or non-separable process?
-Bayesian Spatial Predictor (separate spatio-temporal process) or temporal model (ex : SVM) and after co-kriging ?

In the case of BMS, need to split dataset to ungauged and gauged sites so lost of informations.

